***** Is Europe an optimal Political Area?                *****
***** Alberto Alesina, Guido Tabellini & Francesco Trebbi *****

***** Section IV.ab

* Session set-up
clear all
set mem 10g
set matsize 11000
set more off

* Output directories (capture: carry on if they already exist)
cap mkdir "section4"
cap mkdir "appendix"

use "data/EVS_GSS_final.dta", clear

* Keep the 17 European country codes used in this section (no Eastern Europe,
* Germany not split). The US is left out because distances are computed for
* European units only. inlist() accepts at most 10 string arguments, hence
* the two calls.
keep if inlist(country, "AT", "BE", "DE", "DK", "ES", "FI", "FR", "GR", "IE") ///
      | inlist(country, "IT", "LU", "NL", "NO", "PT", "SE", "GB-GBN", "TR")

** Indicator variables for the categorical covariates (X_<name>1, X_<name>2, ...)
foreach v in marital_status education employment income_recoded size_of_town {
	qui tab `v', gen(X_`v')
}
* education1 gets its own stub so that it does not collide with X_education*
qui tab education1, gen(X_education_v)

** Survey waves: map interview years to wave 1-4; other years are dropped
gen wave = cond(inlist(year, 1981, 1982, 1983, 1984, 1986), 1, ///
           cond(inlist(year, 1990, 1991, 1993),             2, ///
           cond(inlist(year, 1998, 1999, 2000),             3, ///
           cond(inlist(year, 2006, 2008, 2009, 2010),       4, .))))
drop if missing(wave)


** Cultural variables
global ecult2 Y_obedience Y_trust Y_ideology Y_religious Y_divorce Y_euthan Y_suicide Y_altruism Y_hardwork Y_redistrib Y_workingfemale Y_careerfemale Y_preschoolmother Y_private Y_equalize Y_control Y_gay Y_god Y_abortion
* Socio-economic covariates
global ecov2 X_age X_sex X_marital_status* X_employment1 X_employment2 X_employment3 X_employment4 X_employment5 X_employment6 X_income_recoded1 X_income_recoded2 X_income_recoded3 X_size_of_town* X_education1 X_education2 X_education3 X_education4 X_education5 X_education6

** Unit of analysis: replace the country code by area2. The US was already
** removed by the country filter above, so only European units remain.
drop country
gen country=area2

** Variable lists actually used below (here: the extended sets)
global cult $ecult2
global cov $ecov2
global labellist "Full"

********************************************************************************
program gaussian_kernel
	* Pairwise Gaussian-kernel distance between all observations in memory.
	*
	*   varlist    variables spanning the space in which distance is measured
	*   idvar()    observation identifier
	*   genfile()  file that receives the pair-level data (overwritten)
	*   name1()    name given to the identifier of the first observation
	*   name2()    name given to the identifier of the second observation
	*   scorevar() name of the distance variable written to genfile()
	*
	* The file written has one row per ordered pair, including each
	* observation paired with itself, and holds name1, name2 and
	* scorevar = 1 - exp(-||x1 - x2||^2 / (2*D)), with D the number of
	* variables in varlist. The data in memory on entry are reloaded on exit.
	*
	* All options are required: without them the renames and the final save
	* cannot work, so they are enforced by syntax instead of failing later.
	syntax varlist, idvar(varname) genfile(string) name1(name) name2(name) scorevar(name)

	* Scratch copies live in Stata tempfiles, so nothing named rtmp.dta or
	* tmp0.dta in the working directory is overwritten or left behind.
	tempfile original partner
	tempvar dist
	tempname alpha

	save "`original'", replace
	keep `idvar' `varlist'

	/*Set kernel bandwidth equal to the number of dimensions following
	Hainmuller and Hazlett KRLS Package:
	"By default, the bandwidth is set equal to D (the number of dimensions) which
	typically yields a reasonable scaling of the distances between observations
	in the standardized data that is used for the fitting."
	*/
	local D : word count `varlist'
	scalar `alpha' = 1/(2*`D')

	* Second copy of the data with every variable suffixed by 2
	preserve
		foreach var of varlist `idvar' `varlist' {
			rename `var' `var'2
		}
		save "`partner'", replace
	restore

	* All pairwise combinations of observations
	cross using "`partner'"

	* Squared coordinate differences; the names are collected explicitly so
	* that no unrelated variable can be swept into the sum by a wildcard.
	local sqdiffs
	foreach var of varlist `varlist' {
		gen double `var'__d = (`var' - `var'2)^2
		local sqdiffs `sqdiffs' `var'__d
	}

	* Euclidean distance of the vectors. rowtotal() counts a missing term as
	* zero. The distance is squared again in the kernel below; the square
	* root is kept so that results match earlier runs exactly.
	egen double `dist' = rowtotal(`sqdiffs')
	replace `dist' = sqrt(`dist')

	* 1 - Gaussian kernel: 0 = same vectors, 1 = maximally distant vectors
	gen double `scorevar' = 1 - exp(-(`alpha'*`dist'^2))

	rename `idvar' `name1'
	rename `idvar'2 `name2'
	keep `name1' `name2' `scorevar'
	save "`genfile'", replace

	* Hand the caller back the data that were in memory on entry
	use "`original'", clear
end
********************************************************************************
program cosine
	* Pairwise angular (cosine) distance between all observations in memory.
	*
	*   varlist    variables spanning the space in which distance is measured
	*   idvar()    observation identifier
	*   genfile()  file that receives the pair-level data (overwritten)
	*   name1()    name given to the identifier of the first observation
	*   name2()    name given to the identifier of the second observation
	*   scorevar() name of the distance variable written to genfile()
	*
	* The file written has one row per ordered pair, including each
	* observation paired with itself, and holds name1, name2 and
	* scorevar = acos(cosine similarity)/pi. The score is missing when either
	* vector has zero norm. The data in memory on entry are reloaded on exit.
	*
	* All options are required: without them the renames and the final save
	* cannot work, so they are enforced by syntax instead of failing later.
	syntax varlist, idvar(varname) genfile(string) name1(name) name2(name) scorevar(name)

	* Scratch copies live in Stata tempfiles, so nothing named rtmp.dta or
	* tmp0.dta in the working directory is overwritten or left behind.
	tempfile original partner
	tempvar dot norm1 norm2 cosv

	save "`original'", replace
	keep `idvar' `varlist'

	* Second copy of the data with every variable suffixed by 2
	preserve
		foreach var of varlist `idvar' `varlist' {
			rename `var' `var'2
		}
		save "`partner'", replace
	restore

	* All pairwise combinations of observations
	cross using "`partner'"

	* Cross products and squares, collected in explicit lists so that no
	* unrelated variable can be swept into the sums by a wildcard.
	local prods
	local sq1
	local sq2
	foreach var of varlist `varlist' {
		gen double `var'__d  = `var'*`var'2
		gen double `var'__2  = `var'^2
		gen double `var'__22 = `var'2^2
		local prods `prods' `var'__d
		local sq1   `sq1' `var'__2
		local sq2   `sq2' `var'__22
	}

	* Dot product and the two norms (rowtotal() counts a missing term as zero)
	egen double `dot' = rowtotal(`prods')
	egen double `norm1' = rowtotal(`sq1')
	replace `norm1' = sqrt(`norm1')
	egen double `norm2' = rowtotal(`sq2')
	replace `norm2' = sqrt(`norm2')

	gen double `cosv' = `dot'/(`norm1'*`norm2')
	* Rounding can push the ratio marginally outside [-1,1], where acos()
	* returns missing. Clamp it; genuinely missing ratios stay missing
	* because max() and min() would otherwise ignore the missing value.
	replace `cosv' = max(-1, min(1, `cosv')) if !missing(`cosv')

	* Angular distance: 0 = same direction, 1 = opposite direction
	gen double `scorevar' = acos(`cosv')/_pi

	rename `idvar' `name1'
	rename `idvar'2 `name2'
	keep `name1' `name2' `scorevar'
	save "`genfile'", replace

	* Hand the caller back the data that were in memory on entry
	use "`original'", clear
end
********************************************************************************

********************************************************************************
** Draw at most N respondents at random from every country-wave cell
global N = 250
set seed 364011739

* Complete cases only: drop any respondent with a missing covariate or
* cultural variable
tempvar nmiss
egen `nmiss' = rowmiss($cov $cult)
drop if `nmiss' > 0
drop `nmiss'

* Random order within country-wave, then keep the first N of each cell
gen random = runiform()
sort country wave random
by country wave: keep if _n <= $N

* Standardize every variable (mean 0, sd 1 over the pooled sample) so that
* all of them carry equal weight in the distance measures below
tempname mu sd
foreach var of varlist $cov $cult {
	qui sum `var'
	scalar `mu' = r(mean)
	scalar `sd' = r(sd)
	replace `var' = (`var' - scalar(`mu'))/scalar(`sd')
}


** Lookup files used further down to attach country and coordinates to each
** side of a pair: tmp1r.dta is keyed on ROW, tmp1c.dta on COL
foreach side in row col {
	local key = upper("`side'")
	local tag = substr("`side'", 1, 1)
	preserve
		keep persno country wave latitude longitude CapitalLatitude CapitalLongitude
		rename persno `key'
		foreach v in country latitude longitude CapitalLatitude CapitalLongitude {
			rename `v' `v'_`side'
		}
		sort `key'
		save "tmp1`tag'.dta", replace
	restore
}

***** Regress each cultural variable on the covariates and store the
***** residuals as res_<variable>. The regressions use every observation
***** currently in memory. The former global counter and its empty if/else
***** branches had no effect and were removed.
foreach var of varlist $cult {
	reg `var' $cov
	predict res_`var', resid
}

** Generate distance
** For the selected wave, write one file of pairwise distances per variable
** set: tmp_x, tmp_eX, tmp_y, tmp_eY, tmp_resy, tmp_xt, tmp_xi, tmp_resyt and
** tmp_resyi, each suffixed with the wave number.
forvalues i=4(1)4 {
preserve
	keep if wave==`i'
	global wave = `i'

	* Leave out variables with no variance in this wave: they add nothing to
	* a distance. The lists are expanded with -of varlist- so that wildcard
	* entries such as X_marital_status* are tested one variable at a time,
	* and the surviving names are collected in macros rather than dropping
	* variables, so the kernel calls never reference a variable that is gone.
	foreach set in cov ecov2 cult ecult2 {
		local use_`set'
		foreach var of varlist ${`set'} {
			qui sum `var'
			if r(sd) != 0 {
				local use_`set' `use_`set'' `var'
			}
		}
	}

	*x basic socio-economic covariates set
	qui gaussian_kernel `use_cov', idvar(persno) genfile(tmp_x`i') scorevar(cov_mdist) name1(ROW) name2(COL)

	*X extended socio-economic covariates set, only for waves 2 to 4
	if inlist(`i', 2, 3, 4) {
		qui gaussian_kernel `use_ecov2', idvar(persno) genfile(tmp_eX`i') scorevar(ecov_mdist) name1(ROW) name2(COL)
	}

	*y basic culture
	qui gaussian_kernel `use_cult', idvar(persno) genfile(tmp_y`i') scorevar(cult_mdist) name1(ROW) name2(COL)

	*Y extended culture
	qui gaussian_kernel `use_ecult2', idvar(persno) genfile(tmp_eY`i') scorevar(ecult_mdist) name1(ROW) name2(COL)

	*residuals of y basic culture on covariates
	qui gaussian_kernel res_*, idvar(persno) genfile(tmp_resy`i') scorevar(res_cult_mdist) name1(ROW) name2(COL)

	*trust, ideology, and residuals on covariates
	qui gaussian_kernel Y_trust, idvar(persno) genfile(tmp_xt`i') scorevar(Y_trust_mdist) name1(ROW) name2(COL)
	qui gaussian_kernel Y_ideology, idvar(persno) genfile(tmp_xi`i') scorevar(Y_ideology_mdist) name1(ROW) name2(COL)
	qui gaussian_kernel res_Y_trust, idvar(persno) genfile(tmp_resyt`i') scorevar(res_Y_trust_mdist) name1(ROW) name2(COL)
	qui gaussian_kernel res_Y_ideology, idvar(persno) genfile(tmp_resyi`i') scorevar(res_Y_ideology_mdist) name1(ROW) name2(COL)

restore
}

** Stamp every pair-level distance file with its wave and store it sorted by
** ROW COL, ready for merging
forvalues i=4(1)4 {
	global wave = `i'
	foreach stem in x eX y eY resy xt xi resyt resyi {
		* the extended-covariate file only exists for waves 2 to 4
		if "`stem'" == "eX" & !inlist($wave, 2, 3, 4) {
			continue
		}
		use "tmp_`stem'`i'.dta", clear
		gen wave =`i'
		sort ROW COL
		save "tmp_`stem'`i'.dta", replace
	}
}

* Copy the wave-4 files to their pooled names, sorted on the merge key
foreach stem in x xt xi eX y eY resy resyt resyi {
	use "tmp_`stem'4.dta", clear
	sort ROW COL
	save "tmp_`stem'.dta", replace
}
*
* Start from the covariate distances and attach the other measures one file
* at a time. Old-style merge needs both sides sorted on ROW COL; the key is
* unique per pair, so re-sorting before every merge leaves the data unchanged.
use "tmp_x.dta", clear
foreach stem in eX y eY resy xt xi resyt resyi {
	sort ROW COL
	merge ROW COL using "tmp_`stem'.dta"
	tab _m
	drop _m
}
*
* Variable labels for the distance measures
label var cov_mdist            "Dist. in Covariates"
label var ecov_mdist           "Dist. in Ext. Covariates"
label var cult_mdist           "Dist. in Culture"
label var ecult_mdist          "Dist. in Ext. Culture"
label var res_cult_mdist       "Dist. in Cultural residuals"
label var Y_trust_mdist        "Dist. in Trust"
label var Y_ideology_mdist     "Dist. in Ideology"
label var res_Y_trust_mdist    "Dist. in Trust residuals"
label var res_Y_ideology_mdist "Dist. in Ideology residuals"


** Identifying countries
* Attach country and coordinates of the ROW person, then of the COL person.
* Records that exist only in the lookup file (_merge==2) are discarded.
sort ROW
merge ROW using "tmp1r.dta"
keep if _m~=2
drop _m
sort COL
merge COL using "tmp1c.dta"
keep if _m~=2
drop _m
gen same = (country_row==country_col)
*Keeping unique correlations
* drop diagonal
drop if ROW==COL
* Keep each unordered pair once. The pair data contain both (a,b) and (b,a),
* so keeping ROW < COL selects the same observations as the earlier
* rank-and-rowsort construction without needing the rowsort package. The
* data are left sorted by COL ROW, as before.
isid ROW COL
sort COL ROW
keep if ROW < COL
replace country_row = "GB" if country_row == "GB-GBN"
replace country_col = "GB" if country_col == "GB-GBN"
* Install geodist only when it is missing, so reruns do not need network access
capture which geodist
if _rc != 0 {
	ssc install geodist
}
geodist latitude_row longitude_row latitude_col  longitude_col, gen(region_dist)
geodist CapitalLatitude_row CapitalLongitude_row CapitalLatitude_col  CapitalLongitude_col, gen(country_dist)
label var region_dist "Dist. between regions"
label var country_dist "Dist. between countries"

save "data_to_use.dta", replace
saveold "data/data_to_use_EU.dta", replace


* Figure 8
* Kernel densities of pairwise distance, same-country pairs (dashed) against
* different-country pairs (solid), for culture and for its residuals
use "data_to_use.dta", clear
keep if wave==4

local v1 cult_mdist
local t1 "Cultural distance"
local v2 res_cult_mdist
local t2 "Distance in residuals of culture"

forvalues k = 1/2 {
	kdensity `v`k'' if same==1, ///
		addplot(kdensity `v`k'' if same==0, lw(medthick) lp(solid)) ///
		legend(lab(1 "Same country") label(2 "Different countries")) ///
		lw(medthick) lp(dash) title("`t`k''") ///
		note("$labellist set of cultural variables. Wave 4.") nodraw
	graph save "fig`k'", replace
}

graph combine fig1.gph fig2.gph, cols(2) title("Culture: Within vs Between Country")
qui graph export "section4/figure_8.png", width(3000) height(2000) replace
***

* Figure 9
* Same layout as Figure 8, restricted to Turkey: pairs within Turkey (dashed)
* against pairs of one Turkish and one non-Turkish respondent (solid).
* Uses the wave-4 data already in memory.
local v1 cult_mdist
local t1 "Cultural distance"
local v2 res_cult_mdist
local t2 "Distance in residuals of culture"

forvalues k = 1/2 {
	kdensity `v`k'' if same==1 & country_row=="TR", ///
		addplot(kdensity `v`k'' if same==0 & inlist("TR", country_row, country_col), lw(medthick) lp(solid)) ///
		legend(lab(1 "Turkey") label(2 "Turkey & EU countries")) ///
		lw(medthick) lp(dash) title("`t`k''") ///
		note("$labellist set of cultural variables. Wave 4.")
	graph save "fig`k'_Tk", replace
}

graph combine fig1_Tk.gph fig2_Tk.gph, cols(2) title("Turkey and Europe")
qui graph export "section4/figure_9.png", width(3000) height(2000) replace
***

* Figure 10
* Linear fit of cultural distance on covariate distance, estimated separately
* for different-country (same==0) and same-country (same==1) pairs, with
* standard errors clustered on both country identifiers. Needs ivreg2.
use "data_to_use.dta", clear
keep if wave==4
foreach x in 0 1 {
	ivreg2 cult_mdist cov_mdist if same==`x', cluster(country_row country_col)
	* Equation label for the plot. Fixed-format strings always carry the
	* leading zero and a proper sign, instead of gluing "0" in front of the
	* number, which only worked for coefficients strictly between 0 and 1.
	local b0 = string(_b[_cons], "%4.2f")
	local b1 = string(abs(_b[cov_mdist]), "%5.3f")
	local sign = cond(_b[cov_mdist] < 0, "-", "+")
	global regression`x' "y = `b0' `sign' `b1' * x"
	* Fitted values and a 95% band, for the estimation sample only
	predict hat`x' 		if e(sample)
	predict s`x' , stdp
	generate low`x' = hat`x' - 1.96*s`x'
	generate hi`x'  = hat`x' + 1.96*s`x'
}
twoway (rarea low0 hi0 cov_mdist , sort color(gs14) ) (line hat0 cov_mdist, sort lc(black) ylab(0.5(0.02)0.62) text(0.62 0.2 "$regression0"))  ///
   (rarea low1 hi1 cov_mdist , sort color(gs14) ) (line hat1 cov_mdist, sort lc(black) lp(dash) text(0.55 0.6 "$regression1")) ///
   , xtitle("Economic Distance") ytitle("Cultural Distance") title("Differences in economic vs cultural dimensions") subtitle("Within and cross country") ///
   note("Full set of socioeconomic variables. Wave 4.") ///
   legend(order(2 4) lab(4 "Same country") label(2 "Diff. countries"))
graph export "section4/figure_10.png", width(3000) height(2000) replace
***

* Figure 11
* Linear fit of cultural distance on geographic distance between regions,
* separately for different-country and same-country pairs. Needs ivreg2.
use "data_to_use.dta", clear
keep if wave==4
* rescale so that the slope is readable (axis titles say 10k Kms)
replace region_dist = region_dist / 10000

foreach x in 0 1 {
	ivreg2 cult_mdist region_dist if same==`x', cluster(country_row country_col)
	* Equation label for the plot. Fixed-format strings always carry the
	* leading zero and a proper sign, instead of gluing "0" in front of the
	* number. The regressor is referred to by its full name so the code does
	* not depend on variable abbreviation being switched on.
	local b0 = string(_b[_cons], "%5.3f")
	local b1 = string(abs(_b[region_dist]), "%5.3f")
	local sign = cond(_b[region_dist] < 0, "-", "+")
	global regression`x' "y = `b0' `sign' `b1' * x"
	* Fitted values and a 95% band, for the estimation sample only
	predict hat`x' 		if e(sample)
	predict s`x' , stdp
	generate low`x' = hat`x' - 1.96*s`x'
	generate hi`x'  = hat`x' + 1.96*s`x'
}
* Panel a: same-country pairs only
preserve
keep if same==1
twoway (rarea low1 hi1 region_dist , sort color(gs14) ) (line hat1 region_dist, sort lc(black) legend(off) lp(solid) /*xlab(0(0.02)0.1)*/ text(0.55 0.1 "$regression1")), ///
	xtitle("Geographic Distance between regions (10k Kms)") ytitle("Cultural Distance") title("Within Country")
qui graph save fig5_4a, replace
restore

* Panel b: different-country pairs
twoway (rarea low0 hi0 region_dist , sort color(gs14) ) (line hat0 region_dist, sort lc(black) legend(off)  /*ylab(0.5(0.02)0.62)*/ text(0.55 0.3 "$regression0")),  ///
	xtitle("Geographic Distance between regions (10k Kms)") ytitle("Cultural Distance") title("Cross Country")
qui graph save fig5_4b, replace

graph combine fig5_4a.gph fig5_4b.gph, c(2) ycommon ysize(4.4) xsize(11) title("Differences in geographic vs cultural dimensions") note("Full set of socioeconomic variables. Wave 4.")
qui graph export "section4/figure_11.png", width(3000) replace
***


* Table 1
* Matrix B: for every pair of countries, the intercept of a regression of
* cultural distance on covariate distance, i.e. the fitted cultural distance
* at zero covariate distance. The last column pools, for each row country,
* all pairs linking it to a different country.
use "data_to_use.dta", clear
keep if wave==4
global country3 "DE FR IT NL ES GB"
global country4 "BE DE DK ES FR IE IT NL NO SE GB"
global country5 "AT BE DE DK ES FI FR GR IE IT LU NL NO PT SE GB"

* Dimensions follow the country list: one row per country, one column per
* country plus the pooled column
local n : word count $country5
local last = `n' + 1
mat B = J(`n',`last',0)

local i = 0
foreach countryi of global country5 {
	local ++i
	local j = 0
	foreach countryj of global country5 {
		local ++j
		* pairs formed by countryi and countryj, in either order
		qui reg cult_mdist cov_mdist if ((country_row=="`countryi'" & country_col=="`countryj'")|(country_col=="`countryi'" & country_row=="`countryj'"))
		mat B[`i',`j'] = _b[_cons]
	}
	* Pooled column. & binds tighter than |, so without the parentheses the
	* same==0 restriction applied to the second term only and same-country
	* pairs of countryi were included in the regression.
	qui reg cult_mdist cov_mdist if (country_row=="`countryi'" | country_col=="`countryi'") & same==0
	mat B[`i',`last'] = _b[_cons]
}
mat rown B = $country5
mat coln B = $country5 "All EU"
mata: B = st_matrix("B")

* Output graphs & tables
mata:
	dh = _docx_new()
	_docx_set_font(dh, "Garamond")
	_docx_set_size(dh, 15)
end

mata: e = _docx_paragraph_new(dh,"")
mata: e = _docx_paragraph_add_text(dh, "Table: Avg. cultural distance between row & column individuals of identical socioeconomic level.")
mata: e = _docx_set_size(dh, 30)
mata: e = _docx_text_set_bold(dh, 1)
mata: e = _docx_paragraph_new(dh,"")
mata: e = _docx_set_size(dh, 15)
mata: e = _docx_add_matrix(dh, "B", "%10.2f", 1, 1)
mata: e = _docx_add_pagebreak(dh)

mata:
_docx_save(dh, "section4/table1.docx",1)
_docx_close(dh)
end

* Housekeeping
* The working data set has served its purpose
erase "data_to_use.dta"

** Remove every scratch file this do-file may have produced. The erase is
** captured, so names that were never created are skipped silently.
local junk tmp1c.dta tmp1r.dta rtmp.dta tmp0.dta
* per-wave distance files
foreach stem in x y eY resy {
	forvalues w = 1/4 {
		local junk `junk' tmp_`stem'`w'.dta
	}
}
forvalues w = 2/4 {
	local junk `junk' tmp_eX`w'.dta
}
foreach stem in resyi resyt xi xt {
	local junk `junk' tmp_`stem'4.dta
}
* pooled distance files
foreach stem in x y eX eY resy resyi resyt xi xt {
	local junk `junk' tmp_`stem'.dta
}
* intermediate graphs and the working data set
local junk `junk' fig1.gph fig2.gph fig1_Tk.gph fig2_Tk.gph fig5_4a.gph fig5_4b.gph data_to_use.dta

foreach file of local junk {
	capture erase `file'
}
